Note: Grading is based both on your graphs and verbal explanations. Follow all best practices as discussed in class, including choosing appropriate parameters for all graphs. Do not expect the assignment questions to spell out precisely how the graphs should be drawn. Sometimes guidance will be provided, but the absence of guidance does not mean that all choices are ok.
Read Graphical Data Analysis with R, Ch. 4, 5
[5 points]
Data: ames in the openintro package
library(tidyverse)
library(openintro)
# Bar chart of roof styles, ordered from most to least frequent.
ggplot(ames, aes(x = fct_infreq(Roof.Style))) +
  geom_bar() +
  labs(x = "Roof Style") +
  scale_y_continuous(expand = expansion(mult = c(0, .05)))
# Bar chart of month sold; month.abb levels force Jan-Dec order
# instead of the default alphabetical order.
ggplot(ames, aes(x = factor(month.abb[Mo.Sold], levels = month.abb))) +
  geom_bar() +
  labs(x = "") +
  scale_y_continuous(expand = expansion(mult = c(0, .05)))
"Ex" "Fa" "Gd" "Po" "TA" as levels.myvars <- map(select(ames, where(is.factor)), levels) %>%
map_df(paste, collapse = " ") %>%
pivot_longer(everything()) %>%
mutate(value = trimws(value)) %>%
filter(value == "Ex Fa Gd Po TA") %>%
pull(name)
myvars
## [1] "Exter.Cond" "Bsmt.Qual" "Bsmt.Cond" "Heating.QC" "Kitchen.Qual"
## [6] "Fireplace.Qu" "Garage.Qual" "Garage.Cond"
# d) Use facet_wrap() to display the frequency distribution of all variables
# from part c). (Hint: transform the data first with pivot_longer())
# (Question text restored as a comment; it had been fused onto the code line.)
ames %>%
  select(all_of(myvars)) %>%
  pivot_longer(everything()) %>%
  # Empty strings are unrated categories; treat them as missing.
  mutate(value = na_if(value, "")) %>%
  mutate(value = fct_recode(value,
    "Excellent" = "Ex", "Good" = "Gd",
    "Typical" = "TA", "Fair" = "Fa", "Poor" = "Po"
  )) %>%
  # Order levels worst-to-best so the x-axis reads naturally.
  mutate(value = fct_relevel(value, "Poor", "Fair", "Typical", "Good", "Excellent")) %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap(~name, ncol = 2)
[10 points]
Data: seattlepets in the openintro package
pets <- seattlepets
# Cleveland dot plot of the 30 most frequent dog names,
# sorted by count (most popular at the top).
dog_name_counts <- pets %>%
  filter(species == "Dog") %>%
  count(animal_name) %>%
  slice_max(order_by = n, n = 30)
ggplot(dog_name_counts, aes(n, reorder(animal_name, n))) +
  geom_point() +
  ggtitle("Top Dog Names in Seattle") +
  theme_linedraw()
# Dot plot of the 30 most frequent cat names. Missing names are shown as an
# explicit "NA" level placed first (bottom of the plot).
pets %>%
  filter(species == "Cat") %>%
  group_by(animal_name) %>%
  count() %>%
  ungroup() %>%
  slice_max(order_by = n, n = 30) %>%
  mutate(animal_name = fct_reorder(animal_name, n)) %>%
  mutate(animal_name = fct_explicit_na(animal_name, "NA")) %>%
  # Fixed: removed a stray trailing comma that left an empty argument in
  # the fct_relevel() call.
  mutate(animal_name = fct_relevel(animal_name, "NA")) %>%
  ggplot(aes(n, animal_name)) +
  geom_point() +
  ggtitle("Top Cat Names in Seattle") +
  theme_linedraw()
# Per-name cat and dog counts in wide form, plus the proportion of dogs.
propdogs <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  count(animal_name, species, name = "n") %>%
  pivot_wider(names_from = "species", values_from = "n") %>%
  # A name never used for one species pivots to NA; that means zero pets.
  mutate(
    cat = replace_na(Cat, 0),
    dog = replace_na(Dog, 0),
    .keep = "unused"
  ) %>%
  mutate(
    total = dog + cat,
    prop = dog / total
  )
# The 30 "doggiest" names among names used at least 25 times.
propdogs %>%
  filter(total >= 25) %>%
  slice_max(order_by = prop, n = 30) %>%
  ggplot(aes(x = prop, y = reorder(animal_name, prop))) +
  geom_point() +
  # Spell out `subtitle` instead of relying on partial matching of `sub =`.
  ggtitle("Animal names that say 'dog'",
          subtitle = "(among names with counts of at least 25)") +
  ylab("") +
  theme_linedraw()
We used a cutoff of 25 for the names that we choose to include. 25 is somewhat arbitrary – the main point is not to include those with a very small number of occurrences in the dataset.
pets <- seattlepets
# The 30 most common names among cats and dogs combined (a character vector).
top30names <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  count(animal_name, name = "n") %>%
  slice_max(order_by = n, n = 30) %>%
  pull(animal_name)
# Counts for the top-30 names broken out by species.
top30byspecies <- pets %>%
  filter(animal_name %in% top30names, species %in% c("Cat", "Dog")) %>%
  count(animal_name, species, name = "count")
# Combined cat + dog counts per name, tagged as a third "species" so the
# totals can be plotted alongside the per-species points.
top30total <- top30byspecies %>%
  count(animal_name, wt = count, name = "count") %>%
  mutate(species = "Total")
# Stack per-species and total rows, then order names for plotting.
# fct_reorder2 with `species == "Total"` and `-count` orders the names by the
# "Total" rows' counts — NOTE(review): this leans on fct_reorder2's default
# .fun (last2); confirm the resulting order matches the intended
# most-popular-at-top layout. The "NA" level is then made explicit and
# moved to the first position (bottom of the y-axis).
top30 <- bind_rows(top30byspecies, top30total) %>%
mutate(animal_name = fct_reorder2(animal_name, species == "Total", -count)) %>%
mutate(animal_name = fct_explicit_na(animal_name, "NA")) %>%
mutate(animal_name = fct_relevel(animal_name, "NA"))
# Dot plot of the overall top-30 names, colored by species with totals.
ggplot(top30, aes(x = count, y = animal_name, color = species)) +
  geom_point() +
  # Spell out `subtitle` instead of relying on partial matching of `sub =`.
  ggtitle("Top 30 Pet Names Overall", subtitle = "dogs and cats combined") +
  ylab("") +
  theme_linedraw() +
  theme(legend.position = "top")
# Per-name Cat/Dog counts in wide form for the scatterplot.
scatterpets <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = "species", values_from = "count") %>%
  mutate(Cat = replace_na(Cat, 0), Dog = replace_na(Dog, 0))
# Overall cat-to-dog ratio — the slope of the "neutral name" reference line.
# (Fixed: use `<-` for assignment, not `=`, per tidyverse style.)
cat2dogratio <- sum(scatterpets$Cat) / sum(scatterpets$Dog)
# Base scatterplot of cat vs. dog counts per name; the diagonal marks names
# whose cat:dog ratio equals the population-wide ratio. NA names are dropped
# so the bulk of the data is visible.
g <- scatterpets %>%
  filter(!is.na(animal_name)) %>%
  ggplot(aes(Dog, Cat)) +
  geom_point(size = .5) +
  geom_abline(slope = cat2dogratio, intercept = 0) +
  labs(
    title = "Cat vs. Dog Names",
    subtitle = "(the line represents cat to dog name ratios equal to that of the full population)",
    x = "Number of Dogs",
    y = "Number of Cats"
  ) +
  theme_linedraw()
g
Solution
The graph above shows the counts of cat names plotted against the counts of dog names. (I have removed the NA value so it’s easier to see the majority of the data). Points that fall on the line above represent neutral names, as the slope of the line is equal to the ratio overall of cats to dogs in the dataset. Points that fall above the line are more “cat” names; points that fall below the line are more “dog” names. Let’s look at a small section of the graph:
# Zoom into the Dog-count 30-40 band and label the points with exactly
# 32 dogs by name.
g +
  scale_y_continuous(limits = c(0, 40)) +
  scale_x_continuous(limits = c(30, 40), breaks = seq(30, 40)) +
  geom_text(
    aes(label = ifelse(Dog == 32, animal_name, "")),
    size = 2.5, hjust = 0, nudge_x = .1
  )
We see that Cosmo is a neutral name. There are 16 cats and 32 dogs with the name; the ratio is very close to the overall percentage of cats to dogs in the dataset: 0.4915722. The name above the line, Callie, is more “cat,” while the names below the line–Lexi, Finnegan, Bo, Kuma, Roxie, and Dixie–are more “dog.” Of those, Dixie is the “doggiest”: there are 32 dogs but only 2 cats named Dixie.
Since there are very few names that fall as close to the line as Cosmo, we will create a buffer around the line to create a neutral name territory.
After experimenting with a number of options, I found that adding lines which create angles of .12 radians (about 6.9 degrees) on either side of the original diagonal and adding or subtracting a small intercept creates a reasonable looking neutral territory.
# Buffer lines around the population-ratio diagonal carving out a "neutral"
# territory. Named constants replace the magic numbers that were repeated
# four times: the lines sit 0.12 radians (about 6.9 degrees) on either side
# of the diagonal, shifted by a perpendicular offset of 3.
buffer_angle <- .12
buffer_offset <- 3
angle <- atan(cat2dogratio)
catslope <- tan(angle + buffer_angle)
dogslope <- tan(angle - buffer_angle)
catintercept <- buffer_offset / cos(angle + buffer_angle)
dogintercept <- -buffer_offset / cos(angle - buffer_angle)
g +
  geom_abline(slope = catslope, intercept = catintercept) +
  geom_abline(slope = dogslope, intercept = dogintercept) +
  coord_fixed() +
  scale_y_continuous(limits = c(0, 200))
Next we need to determine the region for each point and label some points. Labeling names with at least 35 cats or 100 dogs captured most of the very popular names without overlapping labels.
library(ggrepel)
# Classify each name by which side of the buffer lines it falls on:
# above the upper line -> "Cat", below the lower line -> "Dog",
# in between -> "Neutral".
scatterpets <- scatterpets %>%
  mutate(
    name_type = case_when(
      Cat > catslope * Dog + catintercept ~ "Cat",
      Cat < dogslope * Dog + dogintercept ~ "Dog",
      TRUE ~ "Neutral"
    ),
    # Fix the legend order: Cat on top, Dog on the bottom.
    name_type = fct_relevel(name_type, "Cat", "Neutral", "Dog")
  )
linecolor <- "grey50"
# Scatterplot colored by name type; only very popular names
# (Cat > 35 or Dog > 100) are labeled, with ggrepel avoiding overlaps.
scatterpets %>%
  filter(!is.na(animal_name)) %>%
  mutate(label = ifelse(Cat > 35 | Dog > 100, animal_name, "")) %>%
  ggplot(aes(Dog, Cat, color = name_type, label = label)) +
  geom_point(size = .5) +
  geom_abline(slope = catslope, intercept = catintercept, color = linecolor) +
  geom_abline(slope = dogslope, intercept = dogintercept, color = linecolor) +
  geom_text_repel(size = 2.5, nudge_y = 10, show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 200)) +
  labs(
    title = "Cat vs. Dog Names",
    x = "Number of Dogs",
    y = "Number of Cats"
  ) +
  theme_linedraw() +
  theme(legend.position = "top")
Alternative Solution
One way to think about how much a name is a “dog” name is to consider the probability of getting the observed proportion of dogs for a given name if there were no connection between name and species. For example, there are 126 pets named “Riley.” Since 67.04% of all of the pets are dogs we would expect about .6704 * 126 = 84.47 of the Rileys to be dogs. However, the actual number of dog Rileys is 117. We can set up a hypothesis test to find the probability of getting a result this far (or more) from what’s expected.
We set up a null hypothesis: \(p = .6704\) and then find \(P(\hat{p} > 117/126)\) (and then double it to find the p-value for a two-sided test). We can use binom.test() to get the p-value directly:
binom.test(117, 126, .6704)
##
## Exact binomial test
##
## data: 117 and 126
## number of successes = 117, number of trials = 126, p-value = 6.623e-12
## alternative hypothesis: true probability of success is not equal to 0.6704
## 95 percent confidence interval:
## 0.8687359 0.9668197
## sample estimates:
## probability of success
## 0.9285714
Since the p-value is very small it’s very unlikely that just by chance a lot of dogs are named Riley. Using this method, I found the 30 names with the lowest p-values, in other words, the ones that provide the most evidence against the null hypothesis that a cat or dog is equally likely to have that name.
For this graph, we labeled names with the lowest p-values, that is, the “doggiest” and “cattiest” names:
# Two-sided exact binomial test p-value for the hypothesis that the
# probability a pet with this name is a dog equals .67.
bt <- function(cat, dog) {
  binom.test(x = dog, n = cat + dog, p = .67)$p.value
}
# Per-name counts with a binomial-test p-value and a Cat/Dog/Neutral label.
# Names that are significantly over-represented (p <= .05) for one species
# (with at least a 2:1 skew) are labeled accordingly.
scatterpets <- pets %>%
  filter(!is.na(animal_name)) %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = "species", values_from = "count") %>%
  mutate(Cat = replace_na(Cat, 0), Dog = replace_na(Dog, 0)) %>%
  # Removed a dead `replace_na(animal_name, "NA")` step: NA names are
  # already filtered out at the top of the pipeline, so it was a no-op.
  mutate(pvalue = map2_dbl(Cat, Dog, bt)) %>%
  mutate(Type = case_when(
    pvalue <= .05 & Dog > 2 * Cat ~ "Dog",
    pvalue <= .05 & 2 * Cat > Dog ~ "Cat",
    TRUE ~ "Neutral"
  )) %>%
  mutate(Type = fct_relevel(Type, "Cat", "Neutral", "Dog"))
# Full scatterplot colored by type; only names with extremely small p-values
# (the "doggiest" and "cattiest") get labels.
scatterpets %>%
  mutate(label = ifelse(pvalue <= .00001, animal_name, "")) %>%
  ggplot(aes(Dog, Cat, label = label, color = Type)) +
  geom_point(size = .5) +
  geom_text(size = 2.5, nudge_y = 10, show.legend = FALSE) +
  labs(
    title = "Cat vs. Dog Names",
    x = "Number of Dogs",
    y = "Number of Cats"
  ) +
  theme_linedraw() +
  theme(legend.position = "top")
And a closer look at names with fewer than 25 cats and fewer than 25 dogs:
# Zoom into the low-count corner (< 25 cats and < 25 dogs); label names
# with more than 18 cats.
scatterpets %>%
  mutate(label = ifelse(Cat > 18, animal_name, "")) %>%
  ggplot(aes(Dog, Cat, label = label, color = Type)) +
  geom_point() +
  geom_text(size = 2.5, nudge_x = .5, hjust = 0, show.legend = FALSE) +
  coord_equal() +
  scale_x_continuous(limits = c(0, 25)) +
  scale_y_continuous(limits = c(0, 25)) +
  labs(
    title = "Cat vs. Dog Names",
    x = "Number of Dogs",
    y = "Number of Cats"
  ) +
  theme_linedraw()
Note the skew toward cat names for low frequency names. This makes sense because there are half as many cats in the dataset as dogs.
Many more cats are missing names than dogs: NA is the most popular cat “name” whereas NA does not make the top 30 for dogs. It’s not clear if the cats labeled NA in the dataset don’t have names or if their names are missing for some reason(s).
There are about twice as many dogs as cats. Since the helpfile for the dataset describes it as “Names of registered pets in Seattle, WA, between 2003 and 2018, provided by the city’s Open Data Portal” we can infer that this is the actual ratio of dogs to cats as pets in Seattle during this period.
Even taking into consideration that there are more dogs, the very popular dog names are more common proportionally than the very popular cat names. For example, 9.6 of every 1000 dogs are named Lucy (the most popular dog name) whereas only 6.6 of every 1000 cats are named Luna (the most popular cat name).
To a somewhat surprising degree, most names are given both to cats and dogs. Lucy is the #1 dog name AND the #2 cat name. Other very popular names given to dogs and cats include: Charlie, Luna, Bella, Max, Daisy, Molly, Jack, Lily and Stella. All of these names appear at least 225 times in the dataset.
In contrast, the maximum number of dogs with a single dog-only name is 16. Kodiak, Odio, and Oso are tied for first place in this category. Similarly, after NA the most popular cat-only name is Mittens, given to 19 cats.
[7 points]
Data: ames in the openintro package
For all, adjust parameters to the levels that provide the best views of the data.
Draw four plots of price vs. area with the following variations:
# a) Scatterplot with point size, stroke, and alpha tuned to reduce
# overplotting. (The question fragment "alpha." had been fused onto the
# code line during extraction; restored here as a comment.)
ggplot(ames, aes(area, price / 1000)) +
  geom_point(size = 1, alpha = .5, stroke = 0) +
  ylab("price (in 1000s of U.S. $)") +
  theme_classic()
# b) Light grey points underneath 2-D density contour lines.
ggplot(ames, aes(area, price / 1000)) +
  geom_point(alpha = .5, stroke = 0, color = "grey70") +
  geom_density_2d() +
  labs(y = "price (in 1000s of U.S. $)") +
  theme_classic(14)
# c) Hexagonal heatmap. The gradient endpoints are the 2nd and 8th colors of
# RColorBrewer::brewer.pal(9, "Blues"):
# "#F7FBFF" "#DEEBF7" "#C6DBEF" "#9ECAE1" "#6BAED6" "#4292C6" "#2171B5" "#08519C" "#08306B"
ggplot(ames, aes(area, price / 1000)) +
  geom_hex(bins = 50) +
  scale_fill_gradient(low = "#DEEBF7", high = "#08519C") +
  labs(y = "price (in 1000s of U.S. $)") +
  theme_classic()
# d) Rectangular-bin heatmap with the same Blues gradient as the hex plot.
ggplot(ames, aes(area, price / 1000)) +
  geom_bin2d(bins = 50) +
  scale_fill_gradient(low = "#DEEBF7", high = "#08519C") +
  labs(y = "price (in 1000s of U.S. $)") +
  theme_classic(14)
There are three clear outliers with high area–around 5000 square feet–and low prices (around $200k).
There are two outliers in both dimensions: high area and high price: area > 4000 square feet and price > $700k.
There is a collection of mid/large sized houses–between 2500 and 3000 square feet– with very high prices. They form a ring at the $600k price line.
There are no small area houses with high prices.
Although there’s a lot of overplotting, it is possible to see from the heatmaps and plot with density contour lines that houses with areas between 1000 and 1500 square feet with prices between $100k and $200k occur the most often in the dataset.
The price variance increases as area increases. For example, houses with areas around 3000 square feet had prices between about $200k and $600k, quite a large range.
There appears to be a straight line border with a positive slope above the collection of points in the scatterplot. It serves as an upper limit of price for a particular area. For example, no houses with area of 1000 square feet had a price above $200k and no houses with an area of 2000 square feet had a price above $400k. There is no comparable boundary on the lower limit of price. In other words, there are some large houses with quite low prices.
[8 points]
Data: ames in the openintro package
# a) Redraw price vs. area, this time faceting on Neighborhood (use
# facet_wrap()). Add best-fitting lines and sort the facets by the slope of
# the best-fitting line from low to high. (Use lm() to get the slopes.)
# (Question text restored as a comment; it had been fused onto the code line.)
cor_df <- ames %>%
  group_by(Neighborhood) %>%
  summarize(
    cor = cor(area, price),
    # Slope of the per-neighborhood regression of price on area.
    beta = lm(price ~ area)$coefficients[2],
    meanprice = mean(price)
  ) %>%
  ungroup() %>%
  arrange(beta)
# Facets appear in cor_df's order, i.e. sorted by regression slope.
ames %>%
  ggplot(aes(area / 1000, price / 1000)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = .15) +
  facet_wrap(~fct_relevel(Neighborhood, as.character(cor_df$Neighborhood))) +
  labs(
    x = "living area (in 1000s of square feet)",
    y = "price (in 1000s of U.S. $)"
  ) +
  theme_linedraw()
# Does the regression slope rise with a neighborhood's mean price?
ggplot(cor_df, aes(meanprice, beta, label = Neighborhood)) +
  geom_point() +
  geom_text(nudge_y = 10, size = 3) +
  geom_smooth(method = "lm", se = FALSE)
It seems that there are higher slopes in neighborhoods with higher mean housing prices, since this graph has a positive slope.
# b) Regress price on area by Neighborhood. Is the R^2 higher in
# neighborhoods with higher mean housing prices? Are the results the same for
# slope and R^2? Explain using examples from the graphs.
# (Question text restored as a comment; it had been fused onto the code line.)
# Sort facets by R^2 (= cor^2 for simple regression) from low to high.
r2_df <- cor_df %>% arrange(cor^2)
ames %>%
  ggplot(aes(area / 1000, price / 1000)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = .15) +
  facet_wrap(~fct_relevel(Neighborhood, as.character(r2_df$Neighborhood))) +
  xlab("living area (in 1000s of square feet)") +
  ylab("price (in 1000s of U.S. $)") +
  theme_linedraw()
# BUG FIX: the original `r2_df$rsq = cor_df$cor^2` pasted cor^2 values in
# cor_df's row order (arranged by beta) onto r2_df (arranged by cor^2),
# attaching R^2 values to the wrong neighborhoods. Compute rsq from r2_df's
# own `cor` column so each row gets its own R^2.
r2_df <- r2_df %>% mutate(rsq = cor^2)
ggplot(r2_df, aes(meanprice, rsq, label = Neighborhood)) +
  geom_point() +
  geom_text(nudge_y = .04, size = 3) +
  geom_smooth(method = "lm", se = FALSE)
It seems that the \(R^2\) is higher in neighborhoods with higher mean housing prices, since this graph has a positive slope.
The results are not the same for slope and \(R^2\). For example, Crawford has a high slope but low \(R^2\). This is because there is a lot of variance of price for similar areas.